Libraries
suppressPackageStartupMessages({
library(data.table)
library(DESeq2)
library(gplots)
library(here)
library(hyperSpec)
library(matrixStats)
library(parallel)
library(pander)
library(plotly)
library(RColorBrewer)
library(scatterplot3d)
library(tidyverse)
library(tximport)
library(vsn)
})
Helper functions
source(here("UPSCb-common/src/R/plot.multidensity.R"))
source(here("UPSCb-common/src/R/featureSelection.R"))
Graphics
#pal <- brewer.pal(8,"Dark2")
pal <- brewer.pal(10,"Paired")
hpal <- colorRampPalette(c("blue","white","red"))(100)
mar <- par("mar")
Metadata Sample information
samples <- read_csv(here("doc/samples.csv"))
## Parsed with column specification:
## cols(
## ScilifeID = col_character(),
## SubmittedID = col_character(),
## Stages = col_character(),
## Description = col_character()
## )
samples$ID <- sub(".*/","",sub("_L00[1,2]","",sub("[1,2]_[1].*_.*DXX_","", sub("_dual.*","",samples$ScilifeID))))
samples$Batch <- factor(sprintf("B%d",grepl("P7614",samples$ScilifeID)+1))
filelist <- list.files(here("data/Salmon"),
recursive = TRUE,
pattern = "quant.sf",
full.names = TRUE)
Sanity check to ensure that the data is sorted according to the sample info
filelist <- filelist[match(samples$ScilifeID,sub("_sortmerna.*","",basename(dirname(filelist))))]
stopifnot(all(match(sub("_sortmerna.*","",basename(dirname(filelist))),
samples$ScilifeID) == 1:nrow(samples)))
name the file list vector
names(filelist) <- samples$ID
Read the expression at the gene level
counts <- suppressMessages(round(tximport(files = filelist, type = "salmon",txOut=TRUE)$counts))
# For the PCA, Use the vst data instead and use colMedians or colMeans
tst <- sapply(split.data.frame(t(counts),
samples$Stages),
colSums)
Check how many genes are never expressed
sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
round(sum(sel) * 100/ nrow(counts),digits=1),
sum(sel),
nrow(counts))
## [1] "5.6% percent (21573) of 387368 genes are not expressed"
Let us take a look at the sequencing depth, colouring by Stages
dat_s <- tibble(x=colnames(counts),y=colSums(counts)) %>%
bind_cols(samples)
ggplot(dat_s,aes(x,y,fill=samples$Stages)) + geom_col() +
scale_y_continuous(name="reads") +
theme(axis.text.x=element_text(angle=90,size=4),axis.title.x=element_blank())
Display the per-gene mean expression
i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.
The cumulative gene coverage is as expected
ggplot(melt(log10(rowMeans(counts))),aes(x=value)) +
geom_density() + ggtitle("gene mean raw counts distribution") +
scale_x_continuous(name="mean raw counts (log10)")
## Warning in melt(log10(rowMeans(counts))): The melt generic in data.table
## has been passed a numeric and will attempt to redirect to the relevant
## reshape2 method; please note that reshape2 is deprecated, and this
## redirection is now deprecated as well. To continue using melt methods from
## reshape2 while both libraries are attached, e.g. melt.list, you can prepend
## the namespace like reshape2::melt(log10(rowMeans(counts))). In the next
## version, this warning will become an error.
## Warning: Removed 21573 rows containing non-finite values (stat_density).
The same is done for the individual samples colored by Batch.
#samples$Batch <- factor(sprintf("B%d",grepl("P7614",samples$ScilifeID)+1))
dat_b <- as.data.frame(log10(counts)) %>% utils::stack() %>%
mutate(Batch=samples$Batch[match(ind,samples$ID)]) %>%
mutate(Stages=samples$Stages[match(ind,samples$ID)])
ggplot(dat_b,aes(x=values,group=ind,col=Batch)) +
geom_density() + ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 17209420 rows containing non-finite values (stat_density).
ggplot(dat_b,aes(x=values,group=ind,col=Stages)) +
geom_density() + ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 17209420 rows containing non-finite values (stat_density).
dir.create(here("data/analysis/salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file=here("data/analysis/salmon/raw-unormalised-gene-expression_data.csv"))
For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate.
samples$Stages <- factor(samples$Stages)
dds <- DESeqDataSetFromMatrix(
countData = counts,
colData = samples,
design = ~ Batch+Stages)
## converting counts to integer mode
save(dds,file=here("data/analysis/salmon/dds.rda"))
Check the size factors (i.e. the sequencing library size effect)
dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
| P464_201B | P464_202 | P464_203 | P464_204 | P464_205 | P464_206 | P464_207B |
|---|---|---|---|---|---|---|
| 0.7277 | 1.521 | 1.491 | 1.364 | 1.049 | 1.373 | 1.167 |
| P464_208 | P464_209B | P464_211 | P7614_301_S1 | P7614_301_S1 | P7614_302_S2 |
|---|---|---|---|---|---|
| 1.268 | 0.5299 | 1.635 | 0.959 | 0.9556 | 0.96 |
| P7614_302_S2 | P7614_303_S3 | P7614_303_S3 | P7614_304_S4 | P7614_304_S4 |
|---|---|---|---|---|
| 0.9593 | 1.084 | 1.082 | 1.035 | 1.04 |
| P7614_305_S5 | P7614_305_S5 | P7614_306_S6 | P7614_306_S6 | P7614_307_S7 |
|---|---|---|---|---|
| 1.088 | 1.087 | 1.304 | 1.305 | 1.07 |
| P7614_307_S7 | P7614_308_S8 | P7614_308_S8 | P7614_309_S9 | P7614_309_S9 |
|---|---|---|---|---|
| 1.07 | 1.137 | 1.146 | 0.8997 | 0.8973 |
| P7614_310_S10 | P7614_310_S10 | P7614_311_S11 | P7614_311_S11 | P7614_312_S12 |
|---|---|---|---|---|
| 0.8148 | 0.8171 | 0.8993 | 0.9027 | 0.8269 |
| P7614_312_S12 | P7614_313_S13 | P7614_313_S13 | P7614_314_S14 | P7614_314_S14 |
|---|---|---|---|---|
| 0.8256 | 0.982 | 0.9788 | 0.9207 | 0.924 |
| P7614_315_S15 | P7614_315_S15 | P7614_316_S16 | P7614_316_S16 | P7614_317_S17 |
|---|---|---|---|---|
| 1.055 | 1.062 | 0.9558 | 0.9541 | 0.8551 |
| P7614_317_S17 | P7614_318_S18 | P7614_318_S18 | P7614_319_S19 | P7614_319_S19 |
|---|---|---|---|---|
| 0.8548 | 0.9647 | 0.9629 | 0.9735 | 0.9734 |
| P7614_320_S20 | P7614_320_S20 | P7614_321_S21 | P7614_321_S21 | P7614_322_S22 |
|---|---|---|---|---|
| 0.9145 | 0.9167 | 1.128 | 1.13 | 1.071 |
| P7614_322_S22 | P7614_323_S23 | P7614_323_S23 | P7614_324_S24 | P7614_324_S24 |
|---|---|---|---|---|
| 1.075 | 0.9609 | 0.9636 | 0.991 | 0.9911 |
boxplot(sizes, main="Sequencing libraries size factor")
vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
dir.create(here("data/analysis/DE"))
## Warning in dir.create(here("data/analysis/DE")): '/mnt/picea/home/ccanovi/
## Git/lncRNAs/data/analysis/DE' already exists
save(vst,file=here("data/analysis/DE/vst-blind.rda"))
vsda <- varianceStabilizingTransformation(dds, blind=FALSE)
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
vsta <- assay(vsda)
vsta <- vsta - min(vsta)
save(vsta,file=here("data/analysis/DE/vst-aware.rda"))
Validation
The variance stabilisation worked adequately
meanSdPlot(vst[rowSums(vst)>0,])
pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)
Cumulative components effect
We define the number of variable of the model
nvar=2
nlevel=nlevels(dds$Stages) * nlevels(dds$Batch)
We plot the percentage explained by the different components, the red line represent the number of variable in the model, the orange line the number of variable combinations.
ggplot(tibble(x=1:length(percent),y=cumsum(percent)),aes(x=x,y=y)) +
geom_line() + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
scale_x_continuous("Principal component") +
geom_vline(xintercept=nvar,colour="red",linetype="dashed",size=0.5) +
geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",size=0.5) +
geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",size=0.5) +
geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",size=0.5)
mar=c(5.1,4.1,4.1,2.1)
The PCA shows that a large fraction of the variance is explained by both variables.
scatterplot3d(pc$x[,1],
pc$x[,2],
pc$x[,3],
xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
color=pal[as.integer(dds$Stages)],
pch=c(17,19)[as.integer(dds$Batch)])
legend("topleft",
fill=pal[1:nlevels(dds$Stages)],
legend=levels(dds$Stages))
legend("topright",
pch=c(17,19),
legend=levels(dds$Batch))
par(mar=mar)
pc.dat <- cbind(pc$x,samples)
p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=dds$Stages,shape=dds$Batch,text=dds$ID)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")
ggplotly(p) %>%
layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))
p <- ggplot(pc.dat,aes(x=PC2,y=PC3,col=dds$Stages,shape=dds$Batch,text=dds$ID)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")
ggplotly(p) %>%
layout(xaxis=list(title=paste("PC2 (",percent[1],"%)",sep="")),
yaxis=list(title=paste("PC3 (",percent[2],"%)",sep="")))
p <- ggplot(pc.dat,aes(x=PC1,y=PC3,col=dds$Stages,shape=dds$Batch,text=dds$ID)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")
ggplotly(p) %>%
layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
yaxis=list(title=paste("PC3 (",percent[2],"%)",sep="")))
p <- ggplot(pc.dat,aes(x=PC1,y=PC4,col=dds$Stages,shape=dds$Batch,text=dds$ID)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")
ggplotly(p) %>%
layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
yaxis=list(title=paste("PC4 (",percent[2],"%)",sep="")))
p <- ggplot(pc.dat,aes(x=PC2,y=PC4,col=dds$Stages,shape=dds$Batch,text=dds$ID)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")
ggplotly(p) %>%
layout(xaxis=list(title=paste("PC2 (",percent[1],"%)",sep="")),
yaxis=list(title=paste("PC4 (",percent[2],"%)",sep="")))
Filter for noise
conds <- factor(paste(samples$Stages,samples$Batch))
sels <- rangeFeatureSelect(counts=vst,
conditions=conds,
nrep=3)
## Warning in xy.coords(x, y, xlabel, ylabel, log): 1 y value <= 0 omitted
## from logarithmic plot
vst.cutoff <- 10
Heatmap of “all” genes
hm <- heatmap.2(t(scale(t(vst[sels[[vst.cutoff+1]],]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)
plot(as.hclust(hm$colDendrogram),xlab="",sub="")
CHANGEME
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.3 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] vsn_3.52.0 tximport_1.12.3
## [3] forcats_0.4.0 stringr_1.4.0
## [5] dplyr_0.8.3 purrr_0.3.3
## [7] readr_1.3.1 tidyr_1.0.0
## [9] tibble_2.1.3 tidyverse_1.2.1
## [11] scatterplot3d_0.3-41 RColorBrewer_1.1-2
## [13] plotly_4.9.0 pander_0.6.3
## [15] hyperSpec_0.99-20180627 ggplot2_3.2.1
## [17] lattice_0.20-38 here_0.1
## [19] gplots_3.0.1.1 DESeq2_1.24.0
## [21] SummarizedExperiment_1.14.1 DelayedArray_0.10.0
## [23] BiocParallel_1.18.1 matrixStats_0.55.0
## [25] Biobase_2.44.0 GenomicRanges_1.36.1
## [27] GenomeInfoDb_1.20.0 IRanges_2.18.3
## [29] S4Vectors_0.22.1 BiocGenerics_0.30.0
## [31] data.table_1.12.6
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.4-1 rprojroot_1.3-2 htmlTable_1.13.2
## [4] XVector_0.24.0 base64enc_0.1-3 rstudioapi_0.10
## [7] hexbin_1.27.3 affyio_1.54.0 bit64_0.9-7
## [10] AnnotationDbi_1.46.1 lubridate_1.7.4 xml2_1.2.2
## [13] splines_3.6.1 geneplotter_1.62.0 knitr_1.25
## [16] zeallot_0.1.0 Formula_1.2-3 jsonlite_1.6
## [19] broom_0.5.2 annotate_1.62.0 cluster_2.1.0
## [22] shiny_1.4.0 BiocManager_1.30.9 compiler_3.6.1
## [25] httr_1.4.1 backports_1.1.5 fastmap_1.0.1
## [28] assertthat_0.2.1 Matrix_1.2-17 lazyeval_0.2.2
## [31] limma_3.40.6 cli_1.1.0 later_1.0.0
## [34] acepack_1.4.1 htmltools_0.4.0 tools_3.6.1
## [37] affy_1.62.0 gtable_0.3.0 glue_1.3.1
## [40] GenomeInfoDbData_1.2.1 reshape2_1.4.3 Rcpp_1.0.2
## [43] cellranger_1.1.0 vctrs_0.2.0 preprocessCore_1.46.0
## [46] gdata_2.18.0 nlme_3.1-141 crosstalk_1.0.0
## [49] xfun_0.10 testthat_2.2.1 rvest_0.3.4
## [52] mime_0.7 lifecycle_0.1.0 gtools_3.8.1
## [55] XML_3.98-1.20 zlibbioc_1.30.0 scales_1.0.0
## [58] promises_1.1.0 hms_0.5.1 yaml_2.2.0
## [61] memoise_1.1.0 gridExtra_2.3 rpart_4.1-15
## [64] latticeExtra_0.6-28 stringi_1.4.3 RSQLite_2.1.2
## [67] highr_0.8 genefilter_1.66.0 checkmate_1.9.4
## [70] caTools_1.17.1.2 rlang_0.4.1 pkgconfig_2.0.3
## [73] bitops_1.0-6 evaluate_0.14 labeling_0.3
## [76] htmlwidgets_1.5.1 bit_1.1-14 tidyselect_0.2.5
## [79] plyr_1.8.4 magrittr_1.5 R6_2.4.0
## [82] generics_0.0.2 Hmisc_4.2-0 DBI_1.0.0
## [85] pillar_1.4.2 haven_2.1.1 foreign_0.8-72
## [88] withr_2.1.2 survival_2.44-1.1 RCurl_1.95-4.12
## [91] nnet_7.3-12 modelr_0.1.5 crayon_1.3.4
## [94] KernSmooth_2.23-16 rmarkdown_1.16 locfit_1.5-9.1
## [97] readxl_1.3.1 blob_1.2.0 digest_0.6.22
## [100] xtable_1.8-4 httpuv_1.5.2 munsell_0.5.0
## [103] viridisLite_0.3.0